{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "import codecs\n",
    "import jieba as jb"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input files (Windows paths). Every backslash is doubled so that none of them starts an escape sequence.\n",
    "path=\"F:\\\\Study\\\\data\\\\cluster_jieba\\\\training.csv\"\n",
    "pathstop=\"F:\\\\Study\\\\data\\\\cluster_jieba\\\\stopwords.txt\"\n",
    "# header=None: the first line of the file is read as data, not as column names.\n",
    "train=pd.read_csv(path,header=None,encoding='utf8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   0                                                  1\n",
       "0  2  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...\n",
       "1  2  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。\n",
       "2  1  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...\n",
       "3  2  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...\n",
       "4  2  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专..."
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   label                                               text\n",
       "0      2  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...\n",
       "1      2  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。\n",
       "2      1  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...\n",
       "3      2  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...\n",
       "4      2  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专..."
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Name the two unnamed CSV columns, then preview the first rows.\n",
    "train.columns=['label','text']\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stop words\n",
    "def read_file(file_path):\n",
    "    \"\"\"Read a UTF-8 text file and return its lines without trailing line breaks.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    file_path : str\n",
    "        Path of the text file to read.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        One entry per line, with trailing line-feed and carriage-return\n",
    "        characters removed; all other whitespace is kept.\n",
    "    \"\"\"\n",
    "    # 'utf-8-sig' reads plain UTF-8 unchanged but drops a leading byte-order mark,\n",
    "    # which would otherwise stay attached to the first entry.\n",
    "    # The with-block closes the file handle, which the plain open() call never did.\n",
    "    with codecs.open(file_path,mode='r',encoding='utf-8-sig') as f:\n",
    "        return [line.rstrip('\\n').rstrip('\\r') for line in f]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the stop-word list, one entry per line.\n",
    "# NOTE(review): the saved output of the following cell shows macOS bookmark data\n",
    "# (Macintosh HD, file:///) rather than words - confirm pathstop points at the real text file.\n",
    "stopwords=read_file(pathstop)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['\\ufeffbook    mark    8   8   ?    \\x04\\x10        畈m?\\x13罙    \\uf8f5\\x7f  ?  \\x04   \\x03\\x03   \\x04  \\x05   \\x01\\x01  Users   \\x04   \\x01\\x01  qing\\x07   \\x01\\x01  Desktop \\x06   \\x01\\x01  AI2017  \\x04   \\x01\\x01  code\\x0c',\n",
       " '   \\x01\\x01  Project_text\\x04   \\x01\\x01  Data',\n",
       " '   \\x01\\x01  stopwords.txt       \\x01\\x06  \\x10       ,   <   L   X   l   x   \\x08   \\x04\\x03  6b\\x06     \\x08   \\x04\\x03  Y?     \\x08   \\x04\\x03  j?     \\x08   \\x04\\x03  k譸     \\x08   \\x04\\x03  鉿€     \\x08   \\x04\\x03  墔, \\x02   \\x08   \\x04\\x03  \\x16? \\x02   \\x08   \\x04\\x03  Da? \\x02       \\x01\\x06  ?   ?   ?   ?   ?   \\x08\\x01  \\x18\\x01  (\\x01  \\x08    \\x04  A臼\\ue53a   \\x18   \\x01\\x02  \\x01       \\x1f\\x02      \\x1f\\x02          \\x01\\x05  \\x04   \\x03\\x03  \\x01   \\x08   \\x04\\x03  \\x06       \\x04   \\x03\\x03  ?  \\x08   \\x01\\t  file:///\\x0c',\n",
       " '   \\x01\\x01  Macintosh HD\\x08   \\x04\\x03   ?ht   \\x08    \\x04  A?e?   $   \\x01\\x01  E3973557-1C08-3BCA-A24D-4A8B70C53061\\x18   \\x01\\x02  ?   \\x01   ?  \\x01   ?  \\x01   \\x01   \\x01\\x01  /   3   \\x01\\x02  dnib    \\x01                   \\x03       txt????         \\x14\\x01  ?\\uf8f5\\uf8f5\\x01       \\x16   \\x04\\x10  ?       \\x05\\x10  8\\x01      \\x10\\x10  p\\x01      @\\x10  `\\x01      T\\x10  ?      U\\x10  ?      V\\x10  ?      \\x02   P\\x02      \\x05   ?      \\x10   ?      \\x11   \\x04\\x02      \\x12   ?      \\x13   ?          0\\x02      0   ?      \\x01?  ?      \\x11?          \\x12?  ?      \\x01?  ?      \\x10?  \\x04       \\x17?  x       \"?  \\\\\\x02      ']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word segmentation\n",
    "def segment_text(each_row):\n",
    "    \"\"\"Cut one row's text into words with jieba and drop stop words.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    each_row : mapping-like row with a 'text' entry\n",
    "        For example a row handed over by DataFrame.apply(..., axis=1).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    str\n",
    "        The kept tokens separated by single spaces.\n",
    "    \"\"\"\n",
    "    # Join with a space, not '': gluing the tokens back together reproduces the\n",
    "    # unsegmented sentence, so a word-boundary tokenizer downstream never sees\n",
    "    # the word boundaries jieba found.\n",
    "    kept=(word for word in jb.lcut(each_row['text']) if word not in stopwords)\n",
    "    return ' '.join(kept)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['text_segmentation']=train.apply(segment_text,axis=1)# pandas apply: the first argument is a user-defined function; depending on axis, each column or each row is passed\n",
    "                                                          # to that function, and the results are stored back in train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>text</th>\n",
       "      <th>text_segmentation</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "      <td>合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "      <td>公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "      <td>公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "      <td>公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "      <td>该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   label                                               text  \\\n",
       "0      2  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...   \n",
       "1      2  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。   \n",
       "2      1  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...   \n",
       "3      2  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...   \n",
       "4      2  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...   \n",
       "\n",
       "                                   text_segmentation  \n",
       "0  合晟资产是一家专注于股票、债券等二级市场投资，为合格投资者提供专业资产管理服务的企业。公司业...  \n",
       "1  公司的主营业务为向中小微企业、个体工商户、农户等客户提供贷款服务，自设立以来主营业务未发生过变化。  \n",
       "2  公司立足于商业地产服务，致力于为商业地产开发、销售、运营全产业链提供一整套增值服务，业务覆盖...  \n",
       "3  公司经工商管理部门核准的经营范围为“投资咨询、经济信息咨询，企业管理咨询，品牌推广策划，公共...  \n",
       "4  该公司的主营业务为在中国境内(港、澳、台除外)开展保险代理销售，依托于自身的产品研究能力和专...  "
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Feature extraction\n",
    "from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer\n",
    "\n",
    "# min_df=5 is forwarded to CountVectorizer (minimum document frequency for a term to be kept).\n",
    "vectorizer=CountVectorizer(min_df=5)\n",
    "tranformer=TfidfTransformer()\n",
    "# Term counts -> TF-IDF weights, converted from sparse to a dense array with toarray().\n",
    "train_tfidf=tranformer.fit_transform(vectorizer.fit_transform(train['text_segmentation'])).toarray()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>1637</th>\n",
       "      <th>1638</th>\n",
       "      <th>1639</th>\n",
       "      <th>1640</th>\n",
       "      <th>1641</th>\n",
       "      <th>1642</th>\n",
       "      <th>1643</th>\n",
       "      <th>1644</th>\n",
       "      <th>1645</th>\n",
       "      <th>1646</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1647 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   0     1     2     3     4     5     6     7     8     9     ...   1637  \\\n",
       "0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   \n",
       "1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   \n",
       "2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   \n",
       "3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   \n",
       "4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...    0.0   \n",
       "\n",
       "   1638  1639  1640  1641  1642  1643  1644  1645  1646  \n",
       "0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  \n",
       "\n",
       "[5 rows x 1647 columns]"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Wrap the dense TF-IDF array in a DataFrame; no column names are given, so they are positional integers.\n",
    "df_train_tfidf=pd.DataFrame(data=train_tfidf)\n",
    "df_train_tfidf.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>1638</th>\n",
       "      <th>1639</th>\n",
       "      <th>1640</th>\n",
       "      <th>1641</th>\n",
       "      <th>1642</th>\n",
       "      <th>1643</th>\n",
       "      <th>1644</th>\n",
       "      <th>1645</th>\n",
       "      <th>1646</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1648 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1    2    3    4    5    6    7    8    9  ...    1638  1639  1640  \\\n",
       "0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0   0.0   0.0   \n",
       "1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0   0.0   0.0   \n",
       "2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0   0.0   0.0   \n",
       "3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0   0.0   0.0   \n",
       "4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0   0.0   0.0   \n",
       "\n",
       "   1641  1642  1643  1644  1645  1646  label  \n",
       "0   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "1   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "2   0.0   0.0   0.0   0.0   0.0   0.0      1  \n",
       "3   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "4   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "\n",
       "[5 rows x 1648 columns]"
      ]
     },
     "execution_count": 39,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Append the labels from train as an extra column next to the TF-IDF features.\n",
    "df_train_tfidf['label'] = train['label']\n",
    "df_train_tfidf.head()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output directory (hard-coded Windows path ending in a separator); the table is written with a header row and without the index.\n",
    "path_out=\"F:\\\\Study\\\\data\\\\cluster_jieba\\\\\"\n",
    "df_train_tfidf.to_csv(path_out +'FE_train_tfidf.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>1638</th>\n",
       "      <th>1639</th>\n",
       "      <th>1640</th>\n",
       "      <th>1641</th>\n",
       "      <th>1642</th>\n",
       "      <th>1643</th>\n",
       "      <th>1644</th>\n",
       "      <th>1645</th>\n",
       "      <th>1646</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 1648 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     0    1    2    3    4    5    6    7    8    9  ...    1638  1639  1640  \\\n",
       "0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0   0.0   0.0   \n",
       "1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0   0.0   0.0   \n",
       "2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0   0.0   0.0   \n",
       "3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0   0.0   0.0   \n",
       "4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...     0.0   0.0   0.0   \n",
       "\n",
       "   1641  1642  1643  1644  1645  1646  label  \n",
       "0   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "1   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "2   0.0   0.0   0.0   0.0   0.0   0.0      1  \n",
       "3   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "4   0.0   0.0   0.0   0.0   0.0   0.0      2  \n",
       "\n",
       "[5 rows x 1648 columns]"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Clustering\n",
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "# NOTE: from here on `train` refers to the TF-IDF feature table (plus label), no longer the raw text frame.\n",
    "train=df_train_tfidf\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import normalize\n",
    "from sklearn.cluster import MiniBatchKMeans\n",
    "from sklearn.cluster import KMeans\n",
    "from scipy.sparse import csr_matrix\n",
    "\n",
    "# Split the label column from the feature columns.\n",
    "y_train=train['label']\n",
    "x_train=train.drop(['label'],axis=1)\n",
    "\n",
    "# Keep the array returned by normalize(): the previous call discarded it and relied on\n",
    "# copy=False modifying the DataFrame in place, which is not guaranteed for DataFrame input.\n",
    "# The L2-normalised rows are then stored as a CSR sparse matrix.\n",
    "x_train=csr_matrix(normalize(x_train,norm=\"l2\"))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "def K_cluster_analysis(K,X):\n",
    "    print(\"K-means begin with cluster: {}\".format(K))\n",
    "    \n",
    "    mb_kmeans=KMeans(n_clusters=K)\n",
    "    y_pred=mb_kmeans.fit_predict(X)\n",
    "    CH_score=metrics.calinski_harabaz_score(X.todense(),y_pred)\n",
    "    \n",
    "    print(\"CH_score:{}\".format(CH_score))\n",
    "    \n",
    "    return CH_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "K-means begin with cluster: 5\n",
      "CH_score:69.56591369459119\n",
      "K-means begin with cluster: 10\n",
      "CH_score:44.04552382000272\n",
      "K-means begin with cluster: 15\n",
      "CH_score:34.74226202846419\n",
      "K-means begin with cluster: 20\n",
      "CH_score:30.39340743971487\n",
      "K-means begin with cluster: 30\n",
      "CH_score:26.770888235719863\n",
      "K-means begin with cluster: 40\n",
      "CH_score:22.424471898668674\n",
      "K-means begin with cluster: 50\n",
      "CH_score:20.937766333625554\n"
     ]
    }
   ],
   "source": [
    "from sklearn import metrics #metrics中是各种评价指标，其中有队聚类适用的\n",
    "Ks=[5,10,15,20,30,40,50]\n",
    "CH_scores=[]\n",
    "for K in Ks:\n",
    "    ch=K_cluster_analysis(K,x_train)\n",
    "    CH_scores.append(ch)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAD8CAYAAABn919SAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvIxREBQAAGrFJREFUeJzt3XuQlPWd7/H3dwaQixBuAxIQBwS8LQFlgihqAXNQRK6ZIdGNG+Lqslu7m9U9J5vVXE6STWUjyZaaquTospqEZOMqKojRhMhy8Q4yCCp4Q0DEADPjhYDKRZjv+ePXvTNDhpke6O6n+3k+r6qu7n66e/o7T9V85le/53cxd0dERIpfSdQFiIhIdijQRURiQoEuIhITCnQRkZhQoIuIxIQCXUQkJhToIiIxoUAXEYkJBbqISEx0yOeX9e3b18vLy/P5lSIiRW/9+vXvuntZW+/La6CXl5dTU1OTz68UESl6ZrYjk/epy0VEJCYU6CIiMaFAFxGJiTYD3czOMrONTW77zOwmM+ttZsvNbEvqvlc+ChYRkZa1Geju/rq7j3b30cAY4GNgCXAzsMLdhwMrUs9FRCQi7e1yqQS2uvsOYCawMHV8ITArm4WJiEj7tDfQrwb+K/W4v7vvBkjd92vpA2Y2z8xqzKymvr7+xCsVEZFWZRzoZtYJmAE80J4vcPcF7l7h7hVlZW2Oi2/RfffBXXed0EdFRBKjPS30K4EX3L029bzWzAYApO7rsl1c2uLF8J3vwNGjufoGEZHi155Av4bG7haAR4C5qcdzgaXZKupYVVVQWwvPPJOrbxARKX4ZBbqZdQUmA4ubHL4VmGxmW1Kv3Zr98oKpU6FzZ3jooVx9g4hI8cso0N39Y3fv4+5/bHLsPXevdPfhqfv3c1Vk9+5wxRUh0BsacvUtIiLFrWhmilZXwx/+AM8/H3UlIiKFqWgCfdo06NhR3S4iIsdTNIHesydMngwPPgjuUVcjIlJ4iibQIYx2eest2LAh6kpERApPUQX6zJlQWhpa6SIi0lxRBXqfPjBxorpdRERaUlSBDqHbZcsW2LQp6kpERApL0QX67NlgptEuIiLHKrpA798fLr1U/egiIscqukCHMMlo82Z47bWoKxERKRxFGeif+1y4V7eLiEijogz0gQPhoosU6CIiTRVloEMY7bJhA2zbFnUlIiKFoagDHdRKFxFJK9pALy+HMWMU6CIiaUUb6BBa6WvXws6dUVciIhK9og90CHuOiogkXVEH+ogRMHKkJhmJiECRBzqESUbPPAO7d0ddiYhItIo+0KuqwsqLS5ZEXYmISLSKPtDPPRfOPlujXUREij7QzUIrffVqqK+PuhoRkegUfaBD6EdvaIClS6OuREQkOrEI9FGjYOhQjXYRkWSLRaCbhVb6ihXwwQdRVyMiEo1YBDqEfvQjR+A3v4m6EhGRaMQm0D/7WTj9dHW7iEhyxSbQ06NdHn8c9u2LuhoRkfyLTaBDCPRDh+Cxx6KuREQk/2IV6BdfDAMGaJKRiCRTrAK9pARmz4bf/hY++ijqakRE8iujQDeznmb2oJm9ZmavmtlFZtbbzJab2ZbUfa9cF5uJ6mo4cACWLYu6EhGR/Mq0hf5jYJm7nw2MAl4FbgZWuPtwYEXqeeQuvRT69tVoFxFJnjYD3cx6AJcB9wC4+2F33wvMBBam3rYQmJWrItujQ4fQ7fLoo3DwYNTViIjkTyYt9KFAPfBzM9tgZnebWTegv7vvBkjd98thne1SVQUffhiGMIqIJEUmgd4BuAC4093PBz6iHd0rZjbPzGrMrKY+T8shTpoEvXpptIuIJEsmgf4O8I67r009f5AQ8LVmNgAgdV/X0ofdfYG7V7h7RVlZWTZqblPHjjBjBjzyCBw+nJevFBGJXJuB7u57gJ1mdlbqUCXwCvAIMDd1bC5QUIvXVlXB3r2wcmXUlYiI5EeHDN/3FeDX
ZtYJ2AZcR/hnsMjMrgfeBubkpsQTM3kydO8eul2mTIm6GhGR3Mso0N19I1DRwkuV2S0nezp3hmnTwl6jd94ZRr+IiMRZrGaKHqu6Gt57D558MupKRERyL9aBPmUKdO2qSUYikgyxDvSuXWHq1NDtcvRo1NWIiORWrAMdwmiXPXvg2WejrkREJLdiH+hXXQWnnKJJRiISf7EP9O7d4YorQqA3NERdjYhI7sQ+0CGMdnnnHVi3LupKRERyJxGBPn16WA5Ao11EJM4SEeg9e0JlZeh2cY+6GhGR3EhEoEPodtm+HTZujLoSEZHcSEygz5wJpaXqdhGR+EpMoPftCxMmhEBXt4uIxFFiAh3CJKM33oDNm6OuREQk+xIV6LNng5kmGYlIPCUq0E87DS65RP3oIhJPiQp0CKNdNm2C11+PuhIRkexKXKB/7nPhXt0uIhI3iQv0QYNg3DgFuojET+ICHcJolxdegG3boq5ERCR7EhvoAIsXR1uHiEg2JTLQhwyBCy5Qt4uIxEsiAx1CK33NmrCsrohIHCQ20Kurw726XUQkLhIb6CNGwJ/9mSYZiUh8JDbQIbTSn346bCItIlLsEh3oVVVh5cUlS6KuRETk5CU60M87D846S6NdRCQeEh3oZqGVvno1vPtu1NWIiJycRAc6hH70o0dh6dKoKxEROTmJD/TRo8NEI412EZFil/hAT3e7rFgBe/dGXY2IyIlLfKBD6Hb55BP4zW+irkRE5MRlFOhm9paZvWxmG82sJnWst5ktN7MtqfteuS01dz772bCsrrpdRKSYtaeFPtHdR7t7Rer5zcAKdx8OrEg9L0olJaHb5fe/h/37o65GROTEnEyXy0xgYerxQmDWyZcTnaoqOHQIHnss6kpERE5MpoHuwONmtt7M5qWO9Xf33QCp+365KDBfLr44bCKtSUYiUqw6ZPi+8e6+y8z6AcvN7LVMvyD1D2AewODBg0+gxPwoLYXZs2HhQvj4Y+jaNeqKRETaJ6MWurvvSt3XAUuAsUCtmQ0ASN3XHeezC9y9wt0rysrKslN1jlRXhzBftizqSkRE2q/NQDezbmbWPf0YuBzYBDwCzE29bS5Q9HMtL7sM+vTRaBcRKU6ZdLn0B5aYWfr997r7MjNbBywys+uBt4E5uSszPzp0gFmzYNEiOHgQOneOuiIRkcy1Gejuvg0Y1cLx94DKXBQVpepquOceWL4cpk+PuhoRkcxppugxJk2CT31Ko11EpPgo0I/RqRPMnBlWXzx8OOpqREQyp0BvQVVVWKhr1aqoKxERyZwCvQWXXw6nnqpuFxEpLgr0FnTuDNOmhb1GjxyJuhoRkcwo0I+jujpsS/fUU1FXIiKSGQX6cUyZAl26aJKRiBQPBfpxdOsGU6fC4sXQ0BB1NSIibVOgt6KqCvbsgWefjboSEZG2KdBbcdVVYVy6RruISDFQoLeiRw+44ooQ6O5RVyMi0joFehuqqmDnTli3LupKRERap0Bvw4wZYRVGjXYRkUKnQG9Dr15QWaluFxEpfAr0DFRXw7Zt8OKLUVciInJ8CvQMzJwJJSXqdhGRwqZAz0BZGUyYEAJd3S4iUqgU6BmqqoLXX4dXXom6EhGRlinQMzR7NpjBwoVRVyIi0jIFeoYGDIBrroEf/Qjuvz/qakRE/lSbm0RLo7vvhrffhi99Cfr1g4kTo65IRKSRWujt0KVL2Gv0zDNh1ix46aWoKxIRaaRAb6fevWHZMujeHa68MrTYRUQKgQL9BAweDL/7HXz0UdgI4/33o65IRESBfsJGjoSHH4atW2H6dDhwIOqKRCTpFOgnYcIE+M//hOeegz//czh6NOqKRCTJFOgnac4cuOOO0Fr/+7/XTFIRiY6GLWbBP/wD/OEP8MMfwsCB8M1vRl2RiCSRAj1LfvAD2LULvvUt+PSn4S//MuqKRCRpFOhZUlIC99wDdXUwbx707x/2JBURyRf1oWdRp05hRcZRo+Dzn4fnn4+6IhFJEgV6
lnXvDo891thCf+ONqCsSkaTIONDNrNTMNpjZo6nnQ8xsrZltMbP7zaxT7sosLqedBr//fXg8ZQrs2RNtPSKSDO1pod8IvNrk+XzgdncfDnwAXJ/Nword8OGhpV5bG1rq+/dHXZGIxF1GgW5mg4CrgLtTzw2YBKQ3ZVsIzMpFgcVs7Fh44IGwF2l1NRw+HHVFIhJnmbbQ7wC+BjSknvcB9rr7kdTzd4CBLX3QzOaZWY2Z1dTX159UscVo6lT4j/+Axx+HG27QxCMRyZ02A93MpgF17r6+6eEW3tpiVLn7AnevcPeKsrKyEyyzuF13HXzve/CrX8Ett0RdjYjEVSbj0McDM8xsKtAZ6EFosfc0sw6pVvogYFfuyix+3/hGmE06f36YTfqVr0RdkYjETZstdHe/xd0HuXs5cDWw0t2/CKwCqlNvmwsszVmVMWAGP/lJ2BjjxhtD37qISDadzDj0fwb+t5m9SehTvyc7JcVXaSncey9cfDFcey088UTUFYlInJjn8SpdRUWF19TU5O37CtX778P48bB7Nzz1VFhbXUTkeMxsvbtXtPU+zRSNQHobu27dwjZ2O3dGXZGIxIECPSJnnBG2sdu/X9vYiUh2KNAj9JnPhI0x3nwTZs7UNnYicnIU6BGbOBF++Ut4+mn44he1jZ2InDgFegH4whfg9tthyZKw+5Fmk4rIidAGFwXippvCxKN/+7cw8ejrX4+6IhEpNgr0AjJ/fhjK+I1vhG3svvzlqCsSkWKiQC8gJSXws5+FJXdvuCFsknHllVFXJSLFQn3oBaZTJ3jooTDZaM4cWLcu6opEpFgo0AtQjx5hjHpZWdgc4803o65IRIqBAr1Apbexa2gIE4/q6qKuSEQKnQK9gI0YEbax27UrtNQ//DDqikSkkCnQC9yFF8KiRbBhQ9jG7pNPoq5IRAqVAr0ITJsGd90VumC0jZ2IHI+GLRaJG24IXS/f/naYePSv/xp1RSJSaBToReRb3wqzSX/wgxDu3/1uWLVRRATU5VJUzOCnP4V/+ie47z4YPjzsTbpnT9SViUghUKAXmQ4d4Ic/DGPTr7sO7rwTzjwTbrlFa6qLJJ0CvUgNGgT//u/w2mth4+n582HoUPj+9zW8USSpFOhFbtgw+PWv4cUXYcIE+OY3Q7D/+Mdw8GDU1YlIPinQY2LkyLD70Zo1YSekm24Kfex33w1HjkRdnYjkgwI9Zi68EP77v8Nt4ED4q7+Cc88NF1EbGqKuTkRySYEeU5WV8NxzsHQpnHIKXHMNnH8+PPqoJiaJxJUCPcbMYMaM0L9+773w8ccwfTpcfDGsWhV1dSKSbQr0BCgpCS30V16BBQtg506YNAkmT4bnn4+6OhHJFgV6gnTsGPrU33wTbrsNNm4Mfe6zZ8OmTVFXJyInS4GeQJ07wz/+I2zbBt/7HqxcGUbGXHstbN0adXUicqIU6AnWvXsYt759O3zta7B4MZx9NvzN34Q1Y0SkuCjQhd694dZbQ+v8r/86bFQ9bBh89avw7rtRVycimVKgy/8YMAB+8hN44w24+mq4/XYYMgS+8x3Yty/q6kSkLQp0+RPl5fDzn4cLpVOmhGV6hwyBH/0oDH0UkcLUZqCbWWcze97MXjSzzWb23dTxIWa21sy2mNn9ZtYp9+VKPp1zDjzwANTUwNixoZ992LCwwuPhw1FXJyLHyqSFfgiY5O6jgNHAFDMbB8wHbnf34cAHwPW5K1OiNGYM/O538MQTYanev/3bcPH0F7+A996LujoRSWsz0D1IL8jaMXVzYBLwYOr4QmBWTiqUgnHZZfDkkyHce/YM67H37QsjRsCXvhQ231i/XhtZi0Qloy3ozKwUWA8MA34KbAX2unt6Hb93gIE5qVAKilnoV7/8cnjmmbBezJo1sHw5/OpX4T2dO0NFRZi0NG5cuA0aFG3dIklg3o6VmsysJ7AE+L/Az919WOr46cBv3X1kC5+ZB8wDGDx48JgdO3Zko24pMO5hSYG1
a0PAr1kTWuuHDoXXP/3pxnAfNy5043TtGm3NIsXCzNa7e0Vb72vXJtHuvtfMVgPjgJ5m1iHVSh8E7DrOZxYACwAqKiq0zl9MmcHgweE2Z044dvgwvPRSY8CvWRMmLwGUlobZqemAv/DCsH57icZdiZywNlvoZlYGfJIK8y7A44QLonOBh9z9PjO7C3jJ3f9faz+roqLCa2pqslS6FKP6+rAg2Jo1oTW/dm3jGPdevUKwp7tqxo4Nk55Eki7TFnomgf4ZwkXPUsJF1EXu/i9mNhS4D+gNbACudfdDrf0sBbocq6Eh7IuaDvg1a8L49/RmHGed1bwvfuTIsFG2SJJkLdCzSYEumdi/P4x9b9ofX1sbXuvaNfS/N+2qGajL8RJzCnSJDXfYsaN5wL/wQuPkpkGDmgf8mDHQpUu0NYtkU04uiopEwSwsR1BeDl/4Qjh26FDYiSkd8GvXwoOpWREdOsCoUY0BP25cmOFqFtVvIJIfaqFLbNTVNbbi164NF1/37w+v9e7dvC9+7NgwOUqkGKiFLonTr1/YM3X69PD86FF49dXmF1yXLWvcJPvss5uPjT/vPF1wleKmFrokyr594YJr07Hx9fXhtW7dwgzXpv3xAwZEW68IqIUu0qIePcIG2ZMmhefuYcemphdcb7utcT2awYObB/wFF4SlDUQKkQJdEs0Mhg4Nt2uuCccOHgwbaDe94LpoUXitY0cYPbr5BdehQ3XBVQqDulxEMrBnz59ecP3oo/Ba374h3MePDy3/MWPUFy/ZpXHoIjl09Chs3twY8M89Fy7AQujWmTABKitDwJ93nlrwcnIU6CJ5Vl8Pq1bBihXhtnVrON6/fwj2yspwKy+PtEwpQgp0kYjt2AErVzYG/J494fjQoY3hPnFiGG4p0hoFukgBcQ9dMulwX70a/vjH8NrIkY0Bf9lloctGpCkFukgBO3IENmxoDPinnw6ja0pLwyzWdMBfdBGcckrU1UrUFOgiReTgwXBhNR3w69aFC6+dO8MllzQG/AUXhNCXZFGgixSxffvChtzpgH/55XC8Z8/GETSVlWH5Ao2giT8FukiM1NY2H0GzfXs4PmBA8xE0gwdHW6fkhgJdJMa2b28+gqauLhwfNqz5CJq+faOtU7JDgS6SEO5hklM63J94onGf1lGjmo+gOfXUaGuVE6NAF0moI0fCipLpFvwzz4QNQTp0CEsUpAN+3Djo1CnqaiUTCnQRAeDAAXj22cYWfE1N2IS7a9fmI2hGj9YImkKlQBeRFu3dG7pl0i34zZvD8V69Qr97OuBHjNAImkKhQBeRjOzZ0/wC644d4fjAgc1H0AwaFG2dSaZAF5F2c4dt20Kwr1wZbukdnUaMaD6CpnfvaGtNEgW6iJy0hgbYtKn5CJoPPwxdMeef39iCv/TSsIWf5IYCXUSy7pNPwrIE6Rb8s8/C4cNhJ6dx4xpb8GPHagRNNinQRSTnPv44DItMt+DXrw/dNt26wZlnhrXg+/cPSwSnHze9lZWFfwbSOm0SLSI517UrTJ4cbgAffBCWBl69Gt56KyxZsGVLuD9woOWf0afP8QP/2H8G2qC7dQp0EcmaXr1g9uxwa8o99L3X1oZlCmprW76tXx9eT890PVaPHscP+2NvSZwVq0AXkZwzg+7dw23YsLbff+BAy8Hf9Ngrr4QFy95/v+Wf0bVr5i3/nj3jMeZegS4iBadLFzjjjHBry+HDYWhla63/7dvDht7vvhtG7hyrU6fmAd/aP4I+faCkJPu/czYo0EWkqHXqFCZBDRzY9nuPHoX33jt+l09dXZho9eKL4fEnn/zpzygpCRdz22r1R3HRV4EuIolRWhpCt1+/sJdra9zDRd7jdfmkb2+8Ee4PHmz55/TpE8L94Ydh+PDs/05NtRnoZnY68EvgNKABWODuPzaz3sD9QDnwFvB5d/8gd6WKiOSPWZgN27s3nHNO6+91h/37W7/g+6lP5b7mTFroR4D/4+4vmFl3YL2ZLQe+
DKxw91vN7GbgZuCfc1eqiEhhMgsjcHr0yOyib6602bXv7rvd/YXU4/3Aq8BAYCawMPW2hcCsXBUpIiJta9e1WjMrB84H1gL93X03hNAH+mW7OBERyVzGgW5mpwIPATe5+3GG/bf4uXlmVmNmNfXpZdtERCTrMgp0M+tICPNfu/vi1OFaMxuQen0AUNfSZ919gbtXuHtFWVlZNmoWEZEWtBnoZmbAPcCr7n5bk5ceAeamHs8Flma/PBERyVQmo1zGA38BvGxmG1PHvg7cCiwys+uBt4E5uSlRREQy0Wagu/vTwPFWOajMbjkiInKiCnRFAhERaa+8bnBhZvXAjrx9YW70Bd6NuogCoXPRnM5HczofjU72XJzh7m2OKslroMeBmdVksnNIEuhcNKfz0ZzOR6N8nQt1uYiIxIQCXUQkJhTo7bcg6gIKiM5Fczofzel8NMrLuVAfuohITKiFLiISEwr0VpjZz8yszsw2NTnW28yWm9mW1H2vKGvMFzM73cxWmdmrZrbZzG5MHU/q+ehsZs+b2Yup8/Hd1PEhZrY2dT7uN7NOUdeaL2ZWamYbzOzR1PMkn4u3zOxlM9toZjWpYzn/W1Ggt+4XwJRjjt1M2NhjOLAi9TwJ0hudnAOMA/7OzM4luefjEDDJ3UcBo4EpZjYOmA/cnjofHwDXR1hjvt1I2C8hLcnnAmCiu49uMlwx538rCvRWuPuTwPvHHE7kxh7a6KQ5Dz5MPe2YujkwCXgwdTwx58PMBgFXAXennhsJPRetyPnfigK9/RK/sYc2OglSXQwbCUtHLwe2Anvd/UjqLe8Q/uklwR3A1wj7DgP0IbnnAsI/98fNbL2ZzUsdy/nfSiarLYr8j2M3OgkNsWRy96PAaDPrCSwBWtpKOPbDyMxsGlDn7uvNbEL6cAtvjf25aGK8u+8ys37AcjN7LR9fqhZ6+2W0sUccncxGJ3Hm7nuB1YRrCz3NLN1QGgTsiqquPBoPzDCzt4D7CF0td5DMcwGAu+9K3dcR/tmPJQ9/Kwr09kvkxh7a6KQ5MytLtcwxsy7A/yJcV1gFVKfelojz4e63uPsgdy8HrgZWuvsXSeC5ADCzbmbWPf0YuBzYRB7+VjSxqBVm9l/ABMJKabXAt4GHgUXAYFIbe7j7sRdOY8fMLgGeAl6msZ/064R+9CSej88QLmyVEhpGi9z9X8xsKKGV2hvYAFzr7oeiqzS/Ul0uX3X3aUk9F6nfe0nqaQfgXnf/vpn1Icd/Kwp0EZGYUJeLiEhMKNBFRGJCgS4iEhMKdBGRmFCgi4jEhAJdRCQmFOgiIjGhQBcRiYn/D7x3TJzDe9RPAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "#绘制不同K对应的聚类性能，找到最佳模型\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "\n",
    "plt.plot(Ks,np.array(CH_scores),'b-',label='CH_scores')\n",
    "\n",
    "index=np.unravel_index(np.argmax(CH_scores,axis=None),len(CH_scores))\n",
    "Best_K=Ks[index[0]]\n",
    "print(Best_K)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "#用最佳结果再次聚类\n",
    "mb_kmeans=KMeans(n_clusters=5)\n",
    "y_pred=mb_kmeans.fit_predict(x_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 0, 2, ..., 3, 4, 0])"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "#保存结果\n",
    "\n",
    "feat_name_Kmeans='Kmeans_'+str(Best_K)\n",
    "\n",
    "y=pd.Series(data=y_train,name='target')\n",
    "\n",
    "train_kmeans=pd.concat([pd.Series(name=feat_name_Kmeans,data=y_pred),y],axis=1)\n",
    "train_kmeans.to_csv(path_out+'FE_train_KMeans.csv',index=False,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
